{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "7e018586-ce85-4623-b381-104e9de43faf",
   "metadata": {},
   "source": [
    "### 安装"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "6d56061f-2bf7-4abf-9f52-1df0f5a66f9d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple\n",
      "Requirement already satisfied: jieba in d:\\developer\\python396\\lib\\site-packages (0.42.1)\n",
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING: You are using pip version 21.1.3; however, version 24.1.1 is available.\n",
      "You should consider upgrading via the 'd:\\developer\\python396\\python.exe -m pip install --upgrade pip' command.\n"
     ]
    }
   ],
   "source": [
    "# Use the explicit %pip magic so the package is installed into the running kernel's environment\n",
    "# (a bare `pip install` line only works when IPython automagic is enabled).\n",
    "%pip install jieba -i https://pypi.tuna.tsinghua.edu.cn/simple"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "03cfa6a2-5dd5-4b03-b7e1-eea6f951ff96",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['d:\\\\developer\\\\python396\\\\lib\\\\site-packages\\\\jieba']"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import jieba\n",
    "jieba.__path__"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6c8e732e-1091-4302-99ae-bde893422beb",
   "metadata": {},
   "source": [
    "### 结巴分词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "0cac1ea5-e28f-47e7-b621-3deae28621e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "#精确模式: 把句子最精确的切分开, 比较适合文本分析. 默认精确模式.\n",
    "#全模式: 把句子中所有可能成词的词都扫描出来, cut_all = True, 优点: 速度快, 缺点: 不能解决歧义\n",
    "#paddle: 利用百度的paddlepaddle深度学习框架. 简单来说就是使用百度提供的分词模型. use_paddle=True.\n",
    "#搜索引擎模式: 在精确模式的基础上, 对长词再进行切分, 提高召回率, jieba.cut_for_search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "3cf7aa6c-7009-4a37-8ea5-bd7ed0e8abcc",
   "metadata": {},
   "outputs": [],
   "source": [
    "text = '我爱中华人民共和国'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "5762a9f6-7842-4cd4-8c6b-49ef2092cfc0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['我', '爱', '中华人民共和国']"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#精确模式\n",
    "gen = jieba.cut(text)  #返回的生成器\n",
    "#for word in gen:\n",
    "#    print(word)\n",
    "list(gen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "f3b4ed14-64ea-42ce-94dd-7068742fdf7a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['我', '爱', '中华人民共和国']"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#或者使用lcut直接返回列表\n",
    "jieba.lcut(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "6269ee83-3ef6-4c59-9a14-2abeaac1d533",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['我', '爱', '中华', '中华人民', '中华人民共和国', '华人', '人民', '人民共和国', '共和', '共和国']"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#全模式\n",
    "jieba.lcut(text, cut_all=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "eba31b4a-0c7d-4695-b547-20433b507376",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['我', '爱', '中华', '华人', '人民', '共和', '共和国', '中华人民共和国']"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#搜索引擎模式\n",
    "jieba.lcut_for_search(text)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cdc1a189-1d3e-4071-82e5-5d3160b4a642",
   "metadata": {},
   "source": [
    "### 词性标注"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "c83c5c50-5853-4d3b-bdc3-d1e821349e9b",
   "metadata": {},
   "outputs": [],
   "source": [
    "from jieba import posseg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "dad28153-3dab-434e-9317-cdc8583bde4c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[pair('我', 'r'), pair('爱', 'v'), pair('中华人民共和国', 'ns')]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "posseg.lcut(text)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6c66502a-fb29-4a02-a5de-980a0b18cac2",
   "metadata": {},
   "source": [
    "### 自定义词典"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "6009a434-dd07-488e-9d47-c7a6742b4a31",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['马', '士兵', '教育', '是', '一个', '线上', '培训', '机构']"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text = '马士兵教育是一个线上培训机构'\n",
    "jieba.lcut(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "309a00af-1f8a-4842-a2d4-890471258e17",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['马', '士兵', '教育', '是', '一个', '线上', '培训', '机构']"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text = '马士兵教育是一个线上培训机构'\n",
    "# Raw string: the backslashes of the Windows path are taken literally instead of being\n",
    "# parsed as (invalid) escape sequences, which newer Python versions warn about.\n",
    "# NOTE(review): the stored output matches the segmentation obtained without the user\n",
    "# dictionary, so dict2.txt apparently does not add the expected word - confirm its contents.\n",
    "jieba.load_userdict(r'D:\\developer\\Python396\\Lib\\site-packages\\jieba\\dict2.txt')\n",
    "jieba.lcut(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "c24d3865-f816-478a-a929-8c6c193c4ecd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['云', '计算']"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#直接加词\n",
    "text = '云计算'\n",
    "jieba.lcut(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "a561e462-3867-4998-aec5-a451b4b7bf94",
   "metadata": {},
   "outputs": [],
   "source": [
    "jieba.add_word('云计算')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "193aba80-9b11-4641-8727-a26d0d8b7c61",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['云计算']"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "jieba.lcut(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "55d6a1b7-37b5-48ec-8377-9d4e0526a578",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['云计算', '计算']"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "jieba.lcut(text, cut_all=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "05713064-5472-45f8-b7e5-02912df39be7",
   "metadata": {},
   "source": [
    "### 关键词提取"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "88b50b1b-93e4-406e-a55a-f3decf081ea7",
   "metadata": {},
   "outputs": [],
   "source": [
    "from jieba import analyse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "b7bc90ac-4234-45c1-be7e-4cff5d571b03",
   "metadata": {},
   "outputs": [],
   "source": [
    "text = '8月5日，中国人民解放军东部战区海军继续在台岛周边海域展开实战化训练。央视新闻发布从我舰拍摄的台舰近景，并配了一段唢呐，蔡英文听懂了吗？'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "7ef01442-b8f6-4506-94cb-3ac1ae0779fc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('台舰', 0.5433985228590908),\n",
       " ('并配', 0.5433985228590908),\n",
       " ('台岛', 0.5319751397545455),\n",
       " ('近景', 0.5087557932227272),\n",
       " ('唢呐', 0.42631313976954543),\n",
       " ('听懂', 0.40289659316181814),\n",
       " ('实战', 0.37654120701409094),\n",
       " ('战区', 0.37588481227954545),\n",
       " ('央视', 0.3710956979318182),\n",
       " ('中国人民解放军', 0.35365364695545454)]"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 提取关键词, 其实就是根据tfidf算出来的值\n",
    "analyse.extract_tags(text, topK=10, withWeight=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "c4d23698-3126-4f45-bf42-74e5bc1cac0a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('台舰', 0.5433985228590908),\n",
       " ('并配', 0.5433985228590908),\n",
       " ('台岛', 0.5319751397545455),\n",
       " ('近景', 0.5087557932227272),\n",
       " ('唢呐', 0.42631313976954543),\n",
       " ('听懂', 0.40289659316181814),\n",
       " ('实战', 0.37654120701409094),\n",
       " ('战区', 0.37588481227954545),\n",
       " ('央视', 0.3710956979318182),\n",
       " ('中国人民解放军', 0.35365364695545454)]"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "analyse.tfidf(text, topK=10, withWeight=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "227c8676-0360-4a55-bd96-e1d7a3ff561f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('8', 0, 1)\n",
      "('月', 1, 2)\n",
      "('5', 2, 3)\n",
      "('日', 3, 4)\n",
      "('，', 4, 5)\n",
      "('中国人民解放军', 5, 12)\n",
      "('东部', 12, 14)\n",
      "('战区', 14, 16)\n",
      "('海军', 16, 18)\n",
      "('继续', 18, 20)\n",
      "('在', 20, 21)\n",
      "('台岛', 21, 23)\n",
      "('周边', 23, 25)\n",
      "('海域', 25, 27)\n",
      "('展开', 27, 29)\n",
      "('实战', 29, 31)\n",
      "('化', 31, 32)\n",
      "('训练', 32, 34)\n",
      "('。', 34, 35)\n",
      "('央视', 35, 37)\n",
      "('新闻', 37, 39)\n",
      "('发布', 39, 41)\n",
      "('从', 41, 42)\n",
      "('我', 42, 43)\n",
      "('舰', 43, 44)\n",
      "('拍摄', 44, 46)\n",
      "('的', 46, 47)\n",
      "('台舰', 47, 49)\n",
      "('近景', 49, 51)\n",
      "('，', 51, 52)\n",
      "('并配', 52, 54)\n",
      "('了', 54, 55)\n",
      "('一段', 55, 57)\n",
      "('唢呐', 57, 59)\n",
      "('，', 59, 60)\n",
      "('蔡', 60, 61)\n",
      "('英文', 61, 63)\n",
      "('听懂', 63, 65)\n",
      "('了', 65, 66)\n",
      "('吗', 66, 67)\n",
      "('？', 67, 68)\n"
     ]
    }
   ],
   "source": [
    "gen = jieba.tokenize(text)\n",
    "for g in gen:\n",
    "    print(g)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "86c9ce8e-47f4-45cf-ab75-f0fefe173f2a",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
